 

/*** Logit on who moves ***/
// Runs only when the global r_selection equals 1.
// Fits five logit specifications of us_hq on first-round size, n_ssfp,
// patent/trademark indicators, US VC participation, sector dummies and
// founding-year indicators, with standard errors clustered on founding year.
// Prints the table to the log and writes tex/logit_predictors_of_moving.tex.
if $r_selection == 1{
    //load_data ml , ignore_p_move
    // -use- only accepts -clear- (not -replace-) to discard the data in memory.
    use ml_cross_sectional.dta , clear

    // Pool the listed founding years into a single category coded 0, so they
    // share one year indicator and one cluster in the models below.
    // NOTE(review): presumably these years have too few firms -- confirm.
    replace foundation_year = 0 if inlist(foundation_year,1988,1990,2012,2013,2014)

    // Sector dummies entered as controls in the fuller specifications.
    local sector cleantech comms semicond internet life_science medical_dev misc

    eststo clear
    // (1)-(3): each group of predictors on its own.
    eststo: logit us_hq  ln_first_round , or cluster(foundation_year)
    eststo: logit us_hq   n_ssfp, or cluster(foundation_year)
    eststo: logit us_hq    has_pat has_trademark, or cluster(foundation_year)

    // (4): all predictors jointly, plus sector and founding-year indicators.
    eststo: logit us_hq   ln_first_round  n_ssfp  has_pat has_trademark `sector'  i.foundation_year , or cluster(foundation_year)

    // (5): specification (4) plus the US-VC-in-first-round indicator.
    eststo: logit us_hq  ln_first_round n_ssfp  has_pat has_trademark us_vc_r1 `sector' i.foundation_year , or cluster(foundation_year)

    // On-screen preview; the founding-year indicators are collapsed into one
    // "Year F.E." row by the *year pattern.
    esttab, label se order(ln_first_round n_ssfp has_pat has_trademark us_vc_r1 `sector') indicate(" Year F.E. = *year")

    // LaTeX export. -eform- exponentiates the logit coefficients, so the table
    // reports odds ratios; the note now says so (it previously said
    // "incidence-rate ratios", which is the term for count models). A stray
    // unmatched opening curly quote before \emph{Notes} was also removed.
    esttab using tex/logit_predictors_of_moving.tex , ///
        order(ln_first_round n_ssfp has_pat has_trademark us_vc_r1 `sector') ///
        indicate("Year F.E.=*year") label se pr2 scalar("ll Log Likelihood")  nomtitles replace ///
        prehead("\begin{table} \centering" "\begin{threeparttable}" "\def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}" "\caption{\large Who migrates? Determinants of Israeli startup migration to the US.  \\ Logit regressions. D.V.: Moves to US}" "\begin{tabular}{l*{@M}{cccccc}} \hline \hline" ) ///
        postfoot("\hline\hline" "\end{tabular}" " \begin{tablenotes}" "\item \emph{Notes}: We report the results from estimating logit models for the likelihood that an Israeli startup establishes its headquarters in the US. The regressors of interest are measures for a startup's performance potential. To build the patent and trademark indicators, we only consider patents and trademarks that were applied for during the founding year or the year after. We report odds ratios. Ratios greater than one imply that an increase in the value of a given regressor leads to a higher likelihood that an outcome occurs, with the opposite for ratios less than one. Standard errors are clustered at the founding-year level to account for the possibility that the attractiveness of the US market to Israeli startups might have changed over time. Significance denoted as: * p \textless 0.10, ** p \textless 0.05, *** p \textless 0.01. " "\end{tablenotes}" "\end{threeparttable}"  "\end{table}") ///
        nocons eform eqlabels(none)

}



/***  Machine learning estimates ***/
// Runs only when the global r_machine_learning equals 1.
// Steps:
//   1. Create three empty accumulator files (predictions, ROC scores, features).
//   2. Load the predictor list from selected_ml.do; if the global ml_us_hq is
//      still empty, select predictors with lassoShooting and append the result
//      to selected_ml.do.
//   3. Repeat 50 random 60/40 train/prediction splits, fit -randomforest- on
//      each, and append the held-out predictions, ROC scores and feature
//      importances to the accumulator files.
//   4. Average the predictions per company and draw the figures.
// NOTE(review): no -set seed- is issued in this block, so the runiform()
// splits differ between runs unless a seed is set elsewhere -- confirm.
if $r_machine_learning == 1{
    // Accumulator files start with one variable and zero observations so that
    // -append using- works from the first iteration.
    clear
    gen p_move = .
    save all_iterations__p_moves.dta , replace

    clear
    gen roc_full = .
    save all_iterations__roc_scores.dta , replace

    clear
    gen feature_importance = .
    save all_iterations__features.dta , replace

    //load_data ml , ignore_p_move
    // -use- only accepts -clear- (not -replace-) to discard the data in memory.
    use ml_cross_sectional.dta , clear

    // selected_ml.do defines globals holding previously selected predictors.
    do selected_ml.do

    if "$ml_us_hq" == "" {
         // Exclude these groups of ml_* variables from the lasso candidates.
         drop ml*ocs*
         drop  ml*south* ml*jerusalem* ml*westb* ml*center* ml*north* ml*haifa* ml*semicond* ml*misc* ml*medic* ml*medc* ml*mdc* ml*life* ml*internet* ml*_it* ml*comm* ml*cleant*

         lassoShooting us_hq ml_* yr_* ,  lasiter(100) verbose(0)
         // Persist the selection so later runs skip the lasso step.
         file open f using selected_ml.do , write append
         di "Selected: `r(selected)'"
         file write f _n "global ml_us_hq `r(selected)'" _n
         file close f
         do selected_ml.do
     }


    local vars_us_hq $ml_us_hq

    forvalues i = 1/50 {
        di "In Iteration : `i'"
        //load_data ml , ignore_p_move
        use ml_cross_sectional.dta , clear

        // Random row order, then the first ~60% of rows form the training set.
        gen  rsort = runiform()
        sort rsort

        gen train_index = _n/_N < .6
        gen pred_index  = 1 - train_index
        randomforest us_hq `vars_us_hq', train_index(train_index) predict_index(pred_index) gen(ml_prob_move) store_roc(us_hq_roc_graph.png) store_features(us_hq_features.dta)

        /** Store all the new values **/

        // Drop any p_move already stored in the dataset so the rename cannot
        // fail; the subsector section of this file does the same via safedrop.
        capture drop p_move
        rename ml_prob_move p_move
        keep if pred_index == 1  /** Use only the out-of-sample predictions for all **/
        keep p_move id_company
        duplicates drop id_company, force
        gen iteration = `i'
        append using all_iterations__p_moves.dta
        save all_iterations__p_moves.dta , replace

        // NOTE(review): this file is presumably written by -randomforest-;
        // the path is hard-coded to the user's home directory -- confirm.
        clear
        use ~/temp/forestfile_rocscores.dta
        gen iteration = `i'
        append using all_iterations__roc_scores.dta
        save all_iterations__roc_scores.dta , replace


        clear
        use us_hq_features.dta
        gen iteration = `i'
        append using all_iterations__features.dta
        save all_iterations__features.dta , replace
    }

    /** End of bootstrap loop, now create the graphs **/

    //load_data ml , ignore_p_move
    use ml_cross_sectional.dta , clear

    // Same selection step for the exit outcome; the result is only written to
    // selected_ml.do and is not used further inside this block.
    if "$ml_acq_ipo" == "" {
        lassoShooting acq_ipo ml_* yr_*,  lasiter(100) verbose(0)
        file open f using selected_ml.do , write append
        file write f _n "global ml_acq_ipo `r(selected)'" _n
        file close f
        do selected_ml.do
    }

    // Average the out-of-sample predictions over iterations, per company.
    clear
    use all_iterations__p_moves.dta
    collapse (mean) p_move , by(id_company)
    save p_moves.dta , replace

    //load_data flat
    use ml_cross_sectional.dta , clear
    // NOTE(review): if ml_cross_sectional.dta already contains p_move, this
    // merge keeps the stored values rather than the freshly averaged ones.
    merge 1:1 id_company using p_moves.dta
    sort p_move
    // Assign each row to one of 20 equal-sized bins of the p_move ordering.
    gen p = floor((_n-1)/_N*20)/20

    set scheme s1mono
    /** Comment: The predictors of migration are also good predictors of success **/
    graph bar (mean) acq_ipo  , over(p, label(angle(45))) saving(a.gph, replace) title("A. All startups", size(small)) ytitle("Share of startups that were acquired or went IPO")
    graph bar (mean)  acq_ipo if us_hq ==0, over(p, label(angle(90))) saving(b.gph, replace) title("B. Non-migrants", size(small)) ytitle("Share of startups that were acquired or went IPO")
    graph combine a.gph b.gph, rows(1) iscale(.6) title("") ycommon
    graph export distribution_of_movers_by_p_move.eps, replace


    // Bar chart of the ROC scores collected over the 50 iterations.
    clear
    use all_iterations__roc_scores.dta
    gen rounded_pred = round(roc_prediction,.01)
    gen num = 1
    graph bar (sum) num, over(rounded_pred) title("Distribution of ROC scores") ytitle("Number of Models")

    graph export distribution_of_roc_scores.eps, replace


    set scheme s1mono

    //load_data flat
    use ml_cross_sectional.dta , clear
    // The density plot needs p_move; attach the averaged predictions when the
    // stored dataset does not already carry the variable.
    capture confirm variable p_move
    if _rc {
        merge 1:1 id_company using p_moves.dta , keep(master match) nogenerate
    }

    // The vertical lines mark the smallest p_move among us_hq == 1 firms and
    // the largest p_move among strategic == 1 firms.
    sum p_move if us_hq == 1
    local xmin `r(min)'

    sum p_move if strategic == 1
    local xmax `r(max)'

    twoway (kdensity p_move, bwidth(.025) lpattern(dot)) (kdensity p_move if us_hq == 1, bwidth(.025) lpattern(solid)) (kdensity p_move if strategic == 1, bwidth(.025) lpattern(longdash)), legend(label(1 "All Startups") label(2 "Migrants") label(3 "Exogenous non-migrants")) title("") ytitle("density") xtitle("Predicted probability of migrating to the US estimated by our random forest model") xline(`xmin' `xmax') text( 13 `xmax' "<- region of common support" , placement(e) size(vsmall))
    graph export p_move_histograms.eps, replace

}


// Runs only when the global r_store_ml_features equals 1.
// Builds an investor lookup file, summarises the feature importances collected
// in all_iterations__features.dta per feature, attaches investor names to the
// investor-id features, and writes the 50 most important features to
// tex/us_hq_features.tex.
if $r_store_ml_features==1 {
    // --- Investor lookup: one row per distinct id / name / private flag ---
    use ../2_Dataset/startups_dataset.dta , clear
    keep investor_id investor_name private_investor
    drop if investor_id == .
    duplicates drop
    save investors.dta , replace

    // --- Per-feature summary over all iterations ---
    use all_iterations__features.dta , clear
    gen num_used = 1
    collapse (mean) index feature_importance ///
             (sum)  num_used ///
             (sd)   sd_importance  = feature_importance ///
             (p5)   p5_importance  = feature_importance ///
             (p95)  p95_importance = feature_importance , by(feature_name)

    // Recover the investor id from feature names containing "ml_investor_id";
    // using-only investor rows are discarded, all feature rows are kept.
    gen investor_id = subinstr(feature_name,"ml_investor_id_","",.) if strpos(feature_name , "ml_investor_id")
    destring investor_id, replace
    merge m:1 investor_id using investors.dta , keep(master match) nogenerate

    /** TODO: Create more detailed table **/
    // Display name: "Invested by: <name>" for investor features, otherwise the
    // feature name followed by a space and the (possibly empty) investor name;
    // underscores become spaces and the result is cut to 50 characters.
    gen output_name = cond(strpos(feature_name, "ml_investor_id") > 0, "Invested by: " + investor_name, feature_name + " " + investor_name)
    replace output_name = substr(subinstr(output_name,"_"," ",.), 1, 50)

    label variable index "Order"
    label variable feature_importance "Importance"
    label variable output_name "Feature"
    format feature_importance sd_importance %9.2g

    // Rank by mean importance (descending) and store the rank in index.
    gsort -feature_importance
    replace index = _n
    list index output_name feature_importance sd_importance

    listtex index  output_name feature_importance sd_importance if _n <= 50 using "tex/us_hq_features.tex", ///
        replace type rstyle(tabular) ///
        head("\begin{tabular}{rrrr}"   `"Order & Feature Name & Mean Importance & Std. Dev.\\"') ///
        foot("\end{tabular}")
}



// Runs only when the global r_balance_test_experiment equals 1.
// Writes a balance table comparing us_hq == 0 and us_hq == 1 firms on the
// listed covariates to tex/balancetable.tex, restricted to firms that are
// either strategic or us_hq == 1 and that have in_p_move_range set.
if $r_balance_test_experiment == 1 {
    //load_data ml
    // -use- only accepts -clear- (not -replace-) to discard the data in memory.
    use ml_cross_sectional.dta , clear
    // The label was previously assigned twice in a row; once is enough.
    label variable ml_n_vc "Number of VC Investors First Round"

    balancetable us_hq     univ_tto univ_spin   has_pat has_trademark us_vc_r1      if (strategic | us_hq == 1) & in_p_move_range using "tex/balancetable.tex" , ctitles("Non-Migrants (Control)" "Migrants (Treated)" "Difference")  replace varlabels

}




// Runs only when the global r_reviewer_ml_with_subsector equals 1.
// Repeats the random-forest exercise with company_subsector indicators
// (mlsub_*) added to the candidate predictors: 50 random 60/40 splits, with
// predictions, ROC scores and feature importances accumulated in the
// reviewer_*.dta files, followed by a bar chart of the ROC scores.
// NOTE(review): no -set seed- is issued in this block, so the runiform()
// splits differ between runs unless a seed is set elsewhere -- confirm.
if $r_reviewer_ml_with_subsector == 1 {
    // Accumulator files start with one variable and zero observations so that
    // -append using- works from the first iteration.
    clear
    gen roc_full = .
    save reviewer_roc_scores.dta , replace

    clear
    gen p_move = .
    save reviewer_p_moves.dta , replace

    clear
    gen feature_importance = .
    save reviewer_features.dta , replace


    //load_data ml
    // -use- only accepts -clear- (not -replace-) to discard the data in memory.
    use ml_cross_sectional.dta , clear
    // One indicator per distinct value of company_subsector: mlsub_1, mlsub_2, ...
    tab company_subsector , gen(mlsub_)


    // selected_ml.do defines globals holding previously selected predictors.
    do selected_ml.do
    if "$ml_hq_subsectors" == "" {

        // Exclude these groups of ml_* variables from the lasso candidates.
        drop ml*ocs*
        drop  ml*south* ml*jerusalem* ml*westb* ml*center* ml*north* ml*haifa* ml*semicond* ml*misc* ml*medic* ml*medc* ml*mdc* ml*life* ml*internet* ml*_it* ml*comm* ml*cleant*

        lassoShooting us_hq ml_* yr_* mlsub_* ,  lasiter(100) verbose(0)

        // Persist the selection so later runs skip the lasso step.
        file open f using selected_ml.do , write append
        di "Selected: `r(selected)'"
        file write f _n "global ml_hq_subsectors `r(selected)'" _n
        file close f
        do selected_ml.do
     }



    local vars_subsector $ml_hq_subsectors


    forvalues i = 1/50 {
        di "In Iteration : `i'"
        //load_data ml
        use ml_cross_sectional.dta , clear
        tab company_subsector , gen(mlsub_)


        // Random row order, then the first ~60% of rows form the training set.
        gen  rsort = runiform()
        sort rsort

        gen train_index = _n/_N < .6
        gen pred_index  = 1 - train_index
        randomforest us_hq `vars_subsector', train_index(train_index) predict_index(pred_index) gen(ml_prob_move) store_roc(us_hq_roc_graph_subsector.png) store_features(us_hq_features.dta)


        // Keep only the held-out predictions of this iteration.
        // NOTE(review): the main ML section identifies firms by id_company,
        // this one by firm_id -- confirm that firm_id exists in the dataset.
        safedrop p_move
        rename ml_prob_move p_move
        keep if pred_index == 1
        keep p_move firm_id
        duplicates drop firm_id, force
        gen iteration = `i'
        append using reviewer_p_moves.dta
        save reviewer_p_moves.dta , replace



        // NOTE(review): this file is presumably written by -randomforest-;
        // the path is hard-coded to the user's home directory -- confirm.
        clear
        use ~/temp/forestfile_rocscores.dta
        gen iteration = `i'
        append using reviewer_roc_scores.dta
        save reviewer_roc_scores.dta , replace


        clear
        use us_hq_features.dta
        gen iteration = `i'
        append using reviewer_features.dta
        save reviewer_features.dta , replace
    }


    // Bar chart of the ROC scores collected over the 50 iterations.
    clear
    use reviewer_roc_scores.dta
    gen rounded_pred = round(roc_prediction,.01)
    gen num = 1
    graph bar (sum) num, over(rounded_pred) title("Distribution of ROC scores of subsector model") ytitle("Number of Models")

    graph export subsector_roc_scores.eps, replace
}


// Runs only when the global r_reviewer_ml_subsector_features equals 1.
// Builds a lookup from the mlsub_# indicator names to company_subsector,
// summarises the feature importances in reviewer_features.dta per feature, and
// attaches investor and subsector information. The table export at the end is
// currently commented out.
// Requires investors.dta, which the r_store_ml_features section of this file
// saves.
if $r_reviewer_ml_subsector_features == 1 {
    //load_data ml
    // -use- only accepts -clear- (not -replace-) to discard the data in memory.
    use ml_cross_sectional.dta , clear
    // The mlsub_# indicators come from -tab company_subsector, gen(mlsub_)-,
    // which numbers them 1..K in sorted order of the distinct values.
    // egen group() yields the same 1..K numbering for string and numeric
    // variables alike; copying the raw value only matched when the variable
    // was numeric with consecutive codes starting at 1.
    egen subsector_code = group(company_subsector)
    gen varname = "mlsub_" + string(subsector_code)
    keep varname company_subsector subsector_code
    duplicates drop
    save subsectors.dta , replace


    // Per-feature summary over all iterations.
    clear
    use reviewer_features.dta
    gen num_used = 1
    collapse (mean) index feature_importance (sum) num_used (sd) sd_importance=feature_importance (p5) p5_importance = feature_importance (p95) p95_importance = feature_importance,by(feature_name)

    // Recover the investor id from feature names containing "ml_investor_id"
    // and attach the investor name; using-only investor rows are dropped.
    gen investor_id = subinstr(feature_name,"ml_investor_id_","",.) if strpos(feature_name , "ml_investor_id")
    destring investor_id, replace
    merge m:1 investor_id using investors.dta
    drop if _merge == 2
    drop _merge


    // Attach the subsector to features named mlsub_#.
    gen varname = feature_name
    merge m:1 varname using subsectors.dta
    drop if _merge == 2
    drop _merge



    /** TODO: Create more detailed table **/
/*    gen output_name = feature_name + " " + investor_name
    replace output_name = "Invested by: " + investor_name if strpos(feature_name, "ml_investor_id")

    replace output_name = "Subsector: " + company_subsector if strpos(feature_name,"mlsub")
    replace output_name = subinstr(output_name,"_"," ",.)
    replace output_name = substr(output_name, 1, 50)
    label variable output_name "Feature"
    label variable feature_importance "Importance"
    label variable index "Order"
    gsort -feature_importance

    format feature_importance %9.2g
    format sd_importance %9.2g

    replace index = _n
    list index output_name feature_importance sd_importance
   
    listtex index  output_name feature_importance sd_importance if _n <= 50 using "tex/reviewer.subsector_features.tex",  replace type rstyle(tabular)       head("\begin{tabular}{rrrr}"   `"Order & Feature Name & Mean Importance & Std. Dev.\\"') foot("\end{tabular}")
*/


}
